library(readr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ dplyr 1.0.8
✓ tibble 3.1.6 ✓ stringr 1.4.0
✓ tidyr 1.1.4 ✓ forcats 0.5.1
✓ purrr 0.3.4
── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(ggpubr)
library(ggplot2)
library(gganimate)
library(gifski)
# Load the college basketball team-season data from the working directory.
CBB <- read_csv(file = "cbb.csv")
Rows: 2455 Columns: 24
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): TEAM, CONF, POSTSEASON
dbl (21): G, W, ADJOE, ADJDE, BARTHAG, EFG_O, EFG_D, TOR, TORD, ORB, DRB, FTR, FTRD, 2P_O, 2P_D, 3P_O, 3P_D, ADJ_T,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the raw column names as they come from the CSV.
colnames(CBB)
[1] "TEAM" "CONF" "G" "W" "ADJOE" "ADJDE" "BARTHAG" "EFG_O"
[9] "EFG_D" "TOR" "TORD" "ORB" "DRB" "FTR" "FTRD" "2P_O"
[17] "2P_D" "3P_O" "3P_D" "ADJ_T" "WAB" "POSTSEASON" "SEED" "YEAR"
# Replace the terse source column codes with descriptive names.
# rename() takes new_name = old_name pairs; source names that start with a
# digit are not syntactic R names and therefore need backticks.
CBB <- rename(
  CBB,
  School = TEAM,
  Conference = CONF,
  GamesPlayed = G,
  GamesWon = W,
  AdjustedOffensiveEfficiency = ADJOE,
  AdjustedDefensiveEfficiency = ADJDE,
  PowerRating = BARTHAG,
  EffectiveFieldGoalPercentageShot = EFG_O,
  EffectiveFieldGoalPercentageAllowed = EFG_D,
  TurnoverRate = TOR,
  StealRate = TORD,
  OffensiveReboundRate = ORB,
  OffensiveReboundRateAllowed = DRB,
  FreeThrowRate = FTR,
  FreeThrowRateAllowed = FTRD,
  TwoPointShootingPercentage = `2P_O`,
  TwoPointShootingPercentageAllowed = `2P_D`,
  ThreePointShootingPercentage = `3P_O`,
  ThreePointShootingPercentageAllowed = `3P_D`,
  AdjustedTempo = ADJ_T,
  WinsAboveBubble = WAB,
  Postseason = POSTSEASON,
  Seed = SEED,
  Season = YEAR
)
# Keep only the teams with a postseason result and add their win rate.
# Postseason is recoded to an ordinal score (1 = R68 ... 8 = Champions) so it
# can serve as a regression / ANOVA response. Assigning the numbers into the
# character column one label at a time left the column as text ("8", "7",
# ...); the lookup below yields a genuinely numeric column instead.
postseason_score <- c(
  "R68" = 1, "R64" = 2, "R32" = 3, "S16" = 4,
  "E8" = 5, "F4" = 6, "2ND" = 7, "Champions" = 8
)
q1 <- CBB %>%
  filter(!is.na(Postseason)) %>%
  mutate(
    WinningPercentage = GamesWon / GamesPlayed,
    # A label missing from the lookup becomes NA instead of staying as text.
    Postseason = unname(postseason_score[Postseason])
  )
# Stepwise (both directions) AIC selection of predictors for Postseason.
# Columns kept: GamesPlayed, GamesWon, WinningPercentage, then columns 5-24
# (which include the Postseason response, Seed and Season); School and
# Conference are dropped.
q1b <- q1[, c(3, 4, 25, 5:24)]
# lm() needs a numeric response; coerce in case Postseason is stored as text.
q1b$Postseason <- as.numeric(q1b$Postseason)
none <- lm(Postseason ~ 1, data = q1b)
full <- lm(Postseason ~ ., data = q1b)
# stepAIC() lives in MASS, which is not attached. Calling it through the
# namespace avoids attaching MASS, whose select() would mask dplyr::select().
MASS::stepAIC(
  none,
  scope = list(upper = full),
  direction = "both",
  trace = FALSE
)
Error in stepAIC(none, scope = list(upper = full), direction = "both", :
could not find function "stepAIC"
# Multiple regression of the tournament result on the selected predictors.
q1_model <- lm(
  Postseason ~ GamesPlayed + GamesWon + WinningPercentage + PowerRating +
    AdjustedTempo + OffensiveReboundRateAllowed + FreeThrowRate,
  data = q1b
)
# abline() can only draw a one-predictor fit: handed the 8-coefficient model
# it silently used just the intercept and the first slope. Draw one scatter
# plot per predictor instead, each with its own simple-regression line.
for (predictor in attr(terms(q1_model), "term.labels")) {
  simple_formula <- reformulate(predictor, response = "Postseason")
  plot(simple_formula, data = q1b)
  abline(lm(simple_formula, data = q1b))
}
Warning in abline(q1_model) :
only using the first two of 8 regression coefficients
# One-way ANOVA: does the tournament result differ between conferences?
# Only the grouping column and the response are needed.
q2 <- q1 %>%
  select(Conference, Postseason) %>%
  mutate(
    Conference = as.factor(Conference),
    Postseason = as.numeric(Postseason)
  )
# Box plot per conference, with each conference mean overlaid in red.
boxplot(Postseason ~ factor(Conference), data = q2)
means <- tapply(q2$Postseason, q2$Conference, mean)
points(means, col = "red", pch = 18)
# Fit the ANOVA and print the sums of squares / degrees of freedom.
q2_anova <- aov(Postseason ~ factor(Conference), data = q2)
q2_anova
Call:
aov(formula = Postseason ~ factor(Conference), data = q2)
Terms:
factor(Conference) Residuals
Sum of Squares 205.2532 703.4086
Deg. of Freedom 31 444
Residual standard error: 1.258671
Estimated effects may be unbalanced
# Draw the diagnostic plots for the fitted ANOVA model.
plot(q2_anova)
# q2_anova plot satisfies criteria
# NOTE(review): "criteria" presumably means the ANOVA assumptions - confirm.
# Print the ANOVA table (F test for the conference term).
summary(q2_anova)
Df Sum Sq Mean Sq F value Pr(>F)
factor(Conference) 31 205.3 6.621 4.179 5.83e-12 ***
Residuals 444 703.4 1.584
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# P-value is low -> reject the null hypothesis that all conference means are
# equal, i.e. at least one conference differs from the others.
# See which groups are the most different.
# Pairwise t-tests with the argument name spelled out in full rather than
# relying on partial matching of `p.adj`. With "none" the p-values carry no
# multiple-comparison correction, so read them alongside the Tukey HSD
# intervals computed next.
pairwise.t.test(q2$Postseason, q2$Conference, p.adjust.method = "none")
# Tukey honest significant differences for every pair of conferences.
q2_plot <- TukeyHSD(q2_anova)
plot(q2_plot)
# Games won against turnover rate, with a smoothed trend line.
# The aesthetics are declared once and shared by both layers.
q3 <- ggplot(CBB, aes(x = TurnoverRate, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Turnover Rate", y = "Games Won")
q3
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ACC teams only: share of games won against power rating, with a trend line.
q4 <- CBB %>%
  filter(Conference == "ACC") %>%
  ggplot(aes(x = PowerRating, y = GamesWon / GamesPlayed)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Power Rating Efficiency in the ACC",
    x = "Power Rating",
    y = "Average Win"
  )
q4
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Games won against free throw rate, with a smoothed trend line.
# The x axis shows the FreeThrowRate column, so it is labelled as a rate; the
# earlier label called it a percentage, which is a different statistic.
q5 <- ggplot(CBB, aes(x = FreeThrowRate, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Free Throw Rate", y = "Games Won")
q5
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Share of games won against adjusted offensive efficiency, with a trend line.
q6 <- CBB %>%
  ggplot(aes(x = AdjustedOffensiveEfficiency, y = GamesWon / GamesPlayed)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Offensive Efficiency", y = "Winning Percentage")
q6
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Plots for q7: power rating against offensive and defensive efficiency,
# shown side by side.
q7plot.1 <- CBB %>%
  ggplot(aes(x = AdjustedOffensiveEfficiency, y = PowerRating)) +
  geom_point()
# NOTE(review): this object is named q8plot.2 although it is the second q7
# panel; the name is kept so that any later reference still resolves.
q8plot.2 <- CBB %>%
  ggplot(aes(x = AdjustedDefensiveEfficiency, y = PowerRating)) +
  geom_point()
grid.arrange(q7plot.1, q8plot.2, ncol = 2)
# Plot for q8: average games won per conference, with the bars ordered by the
# number of rows each conference has in CBB (largest first).
# arrange() on the summary table does not affect bar order, because ggplot
# lays out a character x axis alphabetically; the order is therefore encoded
# in the factor levels of Conference.
q8plot.1 <- CBB %>%
  group_by(Conference) %>%
  summarise(Count = n(), GamesWonAvg = mean(GamesWon)) %>%
  mutate(Conference = fct_reorder(Conference, Count, .desc = TRUE)) %>%
  ggplot(aes(x = Conference, y = GamesWonAvg)) +
  geom_col()
q8plot.1
# Plot for q9: allowed 3-point percentage against allowed 2-point percentage,
# annotated with the rr.label statistic computed by stat_regline_equation().
q9 <- CBB %>%
  ggplot(aes(
    x = TwoPointShootingPercentageAllowed,
    y = ThreePointShootingPercentageAllowed
  )) +
  geom_point() +
  stat_regline_equation(aes(label = after_stat(rr.label)), label.y = 40)
q9
# Plot for q10: adjusted offensive efficiency against 3-point and 2-point
# shooting percentage. Each panel has a straight-line fit (no confidence
# band) and the rr.label annotation; the panels are stacked vertically.
q10.1 <- CBB %>%
  ggplot(aes(
    x = ThreePointShootingPercentage,
    y = AdjustedOffensiveEfficiency
  )) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation(aes(label = after_stat(rr.label)), label.y = 120)
q10.2 <- CBB %>%
  ggplot(aes(
    x = TwoPointShootingPercentage,
    y = AdjustedOffensiveEfficiency
  )) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation(aes(label = after_stat(rr.label)), label.y = 120)
grid.arrange(q10.1, q10.2, ncol = 1)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
# Games won against 3-point shooting percentage, with a smoothed trend line.
q11 <- ggplot(CBB, aes(x = ThreePointShootingPercentage, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Correlation Between 3P Shooting Percentage and Games Won",
    x = "3P Shooting %",
    y = "Games Won"
  )
q11
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Free throw rate allowed against 2-point shooting percentage allowed, with a
# smoothed trend line. The y-axis label previously read "Alowed" (typo).
q12 <- ggplot(
  CBB,
  aes(x = TwoPointShootingPercentageAllowed, y = FreeThrowRateAllowed)
) +
  geom_point() +
  geom_smooth() +
  labs(
    title = paste(
      "Correlation Between Allowed 2pt Shooting % and",
      "Free Throw Rate Allowed"
    ),
    x = "2pt Shot Shooting Percentage Allowed",
    y = "Free Throw Rate Allowed"
  )
q12
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Possible graph for question 4: Do more wins over bubble teams lead to higher seeds in March Madness?
# Seed against wins above bubble. Points are coloured by conference; the
# smoother carries no colour mapping, so it is one trend over all teams.
Bubblewins <- ggplot(CBB, aes(x = WinsAboveBubble, y = Seed)) +
  geom_point(aes(color = Conference)) +
  geom_smooth() +
  labs(x = "Wins Above Bubble", y = "Seed")
Bubblewins
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
# Code for follow-up q3
# Adjusted offensive efficiency by season, one line per conference.
# When a conference has more than one team in a season, a raw geom_line()
# joins every team and zig-zags vertically; plotting the per-season mean of
# each conference gives a single readable line per conference.
ggplot(
  q1,
  aes(
    x = Season,
    y = AdjustedOffensiveEfficiency,
    group = Conference,
    color = Conference
  )
) +
  stat_summary(fun = mean, geom = "line")
#transition_reveal(Season)
# Linear model: adjusted offensive efficiency explained by season and
# conference, followed by its coefficient table.
mod1 <- lm(
  formula = AdjustedOffensiveEfficiency ~ Season + Conference,
  data = q1
)
summary(mod1)
Call:
lm(formula = AdjustedOffensiveEfficiency ~ Season + Conference,
data = q1)
Residuals:
Min 1Q Median 3Q Max
-11.7993 -2.9730 -0.0997 3.0028 13.8258
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -310.4006 211.9532 -1.464 0.143773
Season 0.2086 0.1052 1.984 0.047902 *
ConferenceACC 6.3718 1.1199 5.690 2.32e-08 ***
ConferenceAE -5.5240 1.9345 -2.855 0.004499 **
ConferenceAmer 2.2932 1.3812 1.660 0.097569 .
ConferenceASun -4.7269 2.0556 -2.300 0.021938 *
ConferenceB10 5.3016 1.1251 4.712 3.29e-06 ***
ConferenceB12 4.9407 1.1299 4.373 1.53e-05 ***
ConferenceBE 4.0903 1.1536 3.546 0.000433 ***
ConferenceBSky -5.1240 1.9345 -2.649 0.008369 **
ConferenceBSth -7.0219 1.8395 -3.817 0.000154 ***
ConferenceBW -6.6240 1.9345 -3.424 0.000674 ***
ConferenceCAA -1.7097 1.9345 -0.884 0.377283
ConferenceCUSA -4.8097 1.9345 -2.486 0.013277 *
ConferenceHorz -4.9383 1.9345 -2.553 0.011023 *
ConferenceIvy -3.0383 1.9345 -1.571 0.116998
ConferenceMAAC -2.8097 1.9345 -1.452 0.147095
ConferenceMAC -2.3955 1.9345 -1.238 0.216276
ConferenceMEAC -12.1955 1.9345 -6.304 7.02e-10 ***
ConferenceMVC 2.0923 1.6919 1.237 0.216889
ConferenceMWC -0.2815 1.4476 -0.194 0.845912
ConferenceNEC -6.7240 1.9345 -3.476 0.000560 ***
ConferenceOVC -1.0844 1.8395 -0.590 0.555826
ConferenceP12 2.8533 1.2072 2.364 0.018527 *
ConferencePat -4.6955 1.9345 -2.427 0.015615 *
ConferenceSB -3.0029 1.8368 -1.635 0.102790
ConferenceSC -3.7097 1.9345 -1.918 0.055799 .
ConferenceSEC 4.3603 1.1985 3.638 0.000307 ***
ConferenceSlnd -4.1240 1.9345 -2.132 0.033574 *
ConferenceSum -1.1955 1.9345 -0.618 0.536922
ConferenceSWAC -8.8669 1.9345 -4.583 5.96e-06 ***
ConferenceWAC -3.1240 1.9345 -1.615 0.107050
ConferenceWCC 6.9869 1.5886 4.398 1.37e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.522 on 443 degrees of freedom
Multiple R-squared: 0.5283, Adjusted R-squared: 0.4942
F-statistic: 15.5 on 32 and 443 DF, p-value: < 2.2e-16
# plot() with a formula draws the response against each term in turn.
# Conference is a character column, which cannot be placed on a numeric axis
# (the call failed with "need finite 'xlim' values"); wrapped in factor() it
# is drawn as one box plot per conference instead.
plot(AdjustedOffensiveEfficiency ~ Season + factor(Conference), data = q1)
Warning in xy.coords(x, y, xlabel, ylabel, log) :
NAs introduced by coercion
Warning in min(x) : no non-missing arguments to min; returning Inf
Warning in max(x) : no non-missing arguments to max; returning -Inf
Error in plot.window(...) : need finite 'xlim' values
#Figures for 2nd Final Question